Project Overview

Around the world, people are spending an increasing amount of time on their mobile devices for email, social networking, banking and a whole range of other activities. But typing on mobile devices can be a serious pain. SwiftKey, our corporate partner in this capstone, builds a smart keyboard that makes it easier for people to type on their mobile devices. One cornerstone of their smart keyboard is predictive text models. When someone types:

I went to the

the keyboard presents three options for what the next word might be. For example, the three words might be gym, store, restaurant. In this capstone you will work on understanding and building predictive text models like those used by SwiftKey.

Goal

The goal of this project is just to display that you’ve gotten used to working with the data and that you are on track to create your prediction algorithm. Please submit a report on R Pubs (http://rpubs.com/) that explains your exploratory analysis and your goals for the eventual app and algorithm. This document should be concise and explain only the major features of the data you have identified and briefly summarize your plans for creating the prediction algorithm and Shiny app in a way that would be understandable to a non-data scientist manager. You should make use of tables and plots to illustrate important summaries of the data set.

Getting and Cleaning the Data

Load Libraries

library(knitr)
library(stringr)
library(dplyr)
library(quanteda)
library(readtext)
library(R.utils)
library(ggplot2)

set.seed(3301)

Download Data

downloadData <- function(workingDataPath = file.path("data")) {
        # Download Data
        rawDataFileUrl <- "https://d396qusza40orc.cloudfront.net/dsscapstone/dataset/Coursera-SwiftKey.zip"
        downloadedZipfilePath <- file.path(workingDataPath, "Coursera-SwiftKey.zip")
        badWordsFileUrl <- "https://www.cs.cmu.edu/~biglou/resources/bad-words.txt"
        badWordsFilePath <- file.path(workingDataPath, "bad-words.txt")
        basePath <- file.path(workingDataPath, "final", "en_US")
        
        # Create working directory
        if(!file.exists(workingDataPath)) {
                dir.create(workingDataPath)
        }
        # Download zipped file
        if(!file.exists(downloadedZipfilePath)) {
                download.file(rawDataFileUrl, destfile = downloadedZipfilePath, method = "curl")
        }
        # Download Bad Words Text File
        if(!file.exists(badWordsFilePath)) {
                download.file(badWordsFileUrl, destfile = badWordsFilePath, method = "curl")
        }
        # Unzip
        if(!file.exists(basePath)) {
                unzip(zipfile = downloadedZipfilePath, exdir = workingDataPath)
        }
        
        list(blogs = file.path(basePath, "en_US.blogs.txt"), 
             twitter = file.path(basePath, "en_US.twitter.txt"),
             news = file.path(basePath, "en_US.news.txt"),
             badwords = badWordsFilePath)
}

# downloadData() returns a named list of file paths; attach() makes them
# available as the variables blogs, twitter, news, and badwords.
dataPath <- file.path("..", "data")
attach(downloadData(dataPath))
c(blogs, twitter, news, badwords)
## [1] "../data/final/en_US/en_US.blogs.txt"  
## [2] "../data/final/en_US/en_US.twitter.txt"
## [3] "../data/final/en_US/en_US.news.txt"   
## [4] "../data/bad-words.txt"

Sub-Sampling

Since the original data is very large and slow to analyze, we sub-sample 1% of the lines from each file.

subSample <- function(input, output) {
        if(!file.exists(output)) {
                subSamplingRate <- .01
                fileLines <- as.numeric(countLines(input))
                flipABiasedCoin <- rbinom(fileLines, size = 1, prob = subSamplingRate)
                conRead <- file(input, "r")
                conWrite <- file(output, "w")
                len <- 0
                while (length(oneLine <- readLines(conRead, 1, warn = FALSE)) > 0) {
                        len <- len + 1
                        if(flipABiasedCoin[len] == 1) {
                                writeLines(oneLine, conWrite)
                        }
                }
                close(conRead)
                close(conWrite)
        }
        return(as.numeric(countLines(output)))
}

blogsSubSampling <- file.path(dataPath, "sub-sample.blogs.txt")
subBlogs <- subSample(blogs, blogsSubSampling)

twitterSubSampling <- file.path(dataPath, "sub-sample.twitter.txt")
subTweets <- subSample(twitter, twitterSubSampling)

newsSubSampling <- file.path(dataPath, "sub-sample.news.txt")
subNews <- subSample(news, newsSubSampling)

c(subBlogs, subTweets, subNews)
## [1] 23673 23662 10161

Train and Test Data

Divide the sampled data into a training set and a test set at a 7:3 ratio.

divideDataset <- function(input, outputTrain, outputTest) {
        if(!file.exists(outputTrain) || !file.exists(outputTest)) {
                trainRate <- .7
                fileLines <- as.numeric(countLines(input))
                flipABiasedCoin <- rbinom(fileLines, size = 1, prob = trainRate)
                conRead <- file(input, "r")
                conWriteTrain <- file(outputTrain, "w")
                conWriteTest <- file(outputTest, "w")
                len <- 0
                while (length(oneLine <- readLines(conRead, 1, warn = FALSE)) > 0) {
                        len <- len + 1
                        if(flipABiasedCoin[len] == 1) {
                                writeLines(oneLine, conWriteTrain)
                        } else {
                                writeLines(oneLine, conWriteTest)
                        }
                }
                close(conRead)
                close(conWriteTrain)
                close(conWriteTest)
        }
        return(as.numeric(countLines(outputTrain)))
}

blogsTrain <- paste0(blogsSubSampling, ".train.txt")
blogsTest <- paste0(blogsSubSampling, ".test.txt")
twitterTrain <- paste0(twitterSubSampling, ".train.txt")
twitterTest <- paste0(twitterSubSampling, ".test.txt")
newsTrain <- paste0(newsSubSampling, ".train.txt")
newsTest <- paste0(newsSubSampling, ".test.txt")

trainBlogs <- divideDataset(blogsSubSampling, blogsTrain, blogsTest)
trainTweets <- divideDataset(twitterSubSampling, twitterTrain, twitterTest)
trainNews <- divideDataset(newsSubSampling, newsTrain, newsTest)

c(trainBlogs, trainTweets, trainNews)
## [1] 16598 16525  7185

Load Data

Load the files with the readtext package and build a corpus for each source with the quanteda package.

blogsCorpus <- readtext(blogsTrain) %>% 
        corpus()
twitterCorpus <- readtext(twitterTrain) %>% 
        corpus()
newsCorpus <- readtext(newsTrain) %>% 
        corpus()

Load Bad Words

profanity <- readLines(badwords)

Exploratory Data Analysis

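The table below summarizes the three training sets. The code that produced it is not shown here; a minimal sketch of how it could be built, assuming quanteda's summary() (which reports Types, Tokens, and Sentences per document) plus the line counts computed in the splitting step above:

corpusStats <- rbind(summary(blogsCorpus),
                     summary(twitterCorpus),
                     summary(newsCorpus))
corpusStats$Lines <- c(trainBlogs, trainTweets, trainNews)
kable(corpusStats[, c("Text", "Lines", "Sentences", "Tokens", "Types")])
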
Text                               Lines  Sentences  Tokens  Types
sub-sample.blogs.txt.train.txt     16598      18034  256841  27642
sub-sample.twitter.txt.train.txt   16525      18175  256447  27506
sub-sample.news.txt.train.txt       7185      13325  285258  31187

Top 20

# The exploratory analysis below focuses on the Twitter training set.
twitterToken <- twitterCorpus %>% 
        # normalize words
        tokens(remove_punct = TRUE,
               remove_numbers = TRUE,
               remove_url = TRUE,
               include_docvars = FALSE) %>%
        # removing profanity and other words
        tokens_remove(profanity)

twitterDfm <- twitterToken %>% 
        dfm()
twitterDfm %>%
        topfeatures(20)
##  the   to    i    a  you  and  for   in   of   is   it   my   on that   me 
## 6424 5335 5019 4284 3798 3073 2681 2585 2566 2458 2152 2117 1970 1603 1478 
##   at   be with your have 
## 1279 1249 1207 1197 1190

Plot Word Cloud

twitterDfm %>% 
        textplot_wordcloud(min_count = 6,
                           random_order = FALSE, 
                           rotation = .25,
                           color = RColorBrewer::brewer.pal(8, "Dark2"))

Normalize Words (Remove Stop Words)

twitterDfmNoStopWord <- twitterToken %>% 
        # removing stop words
        tokens_remove(stopwords('english')) %>%
        dfm()

Top 20 Normalized Words

twitterDfmNoStopWord %>%
        topfeatures(20)
##   just   like    get   good   love    day    can thanks    now     rt 
##   1049    849    809    734    709    648    633    610    610    595 
##    one   know   time      u    new  great  today     go    lol    see 
##    579    557    543    537    533    519    518    476    463    460

Plot Normalized Word Cloud

twitterDfmNoStopWord %>%
        textplot_wordcloud(min_count = 6,
                           random_order = FALSE,
                           max_words = 100,
                           rotation = .25,
                           color = RColorBrewer::brewer.pal(8, "Dark2"))

Normalized Word Frequency Plots

featuresTwitter <- twitterDfmNoStopWord %>%
        textstat_frequency(n = 80)

# Sort by reverse frequency order
featuresTwitter$feature <- featuresTwitter %>% 
        with(reorder(feature, -frequency))

featuresTwitter %>%
        ggplot(aes(x = feature, y = frequency)) +
        geom_point() + 
        theme(axis.text.x = element_text(angle = 90, hjust = 1))

2-Gram Top 20

twitterDfm2Gram <- twitterToken %>%
        tokens_ngrams(n = 2) %>%
        dfm()
twitterDfm2Gram %>%
        topfeatures(20)
##     in_the    for_the     of_the     on_the      to_be thanks_for 
##        531        495        430        343        298        288 
##     to_the     at_the     if_you  thank_you     i_love       i_am 
##        281        262        249        243        241        227 
##     i_have     have_a      for_a   going_to      i_was    will_be 
##        223        220        215        201        187        183 
##     to_see    want_to 
##        181        181

2-Gram Word Cloud

twitterDfm2Gram %>%
        textplot_wordcloud(min_count = 6,
                           random_order = FALSE,
                           max_words = 100,
                           rotation = .25,
                           color = RColorBrewer::brewer.pal(8, "Dark2"))

2-Gram Frequency Plots

featuresTwitter2Gram <- twitterDfm2Gram %>%
        textstat_frequency(n = 80)

# Sort by reverse frequency order
featuresTwitter2Gram$feature <- featuresTwitter2Gram %>%
        with(reorder(feature, -frequency))

featuresTwitter2Gram %>%
        ggplot(aes(x = feature, y = frequency)) +
        geom_point() + 
        theme(axis.text.x = element_text(angle = 90, hjust = 1))

3-Gram Top 20

twitterDfm3Gram <- twitterToken %>%
        tokens_ngrams(n = 3) %>%
        dfm()
twitterDfm3Gram %>%
        topfeatures(20)
##     thanks_for_the      thank_you_for         i_love_you 
##                151                 62                 60 
## looking_forward_to      can't_wait_to          i_want_to 
##                 59                 54                 53 
##         one_of_the     for_the_follow        going_to_be 
##                 51                 49                 45 
##           a_lot_of       have_a_great          i_need_to 
##                 45                 42                 40 
##         to_see_you       i_don't_know          i_have_to 
##                 38                 37                 37 
##        is_going_to            to_be_a        let_me_know 
##                 34                 34                 34 
##        you_want_to         you_have_a 
##                 34                 33

3-Gram Word Cloud

twitterDfm3Gram %>%
        textplot_wordcloud(#min_count = 4,
                           random_order = FALSE,
                           max_words = 50,
                           rotation = .25,
                           color = RColorBrewer::brewer.pal(8, "Dark2"))

3-Gram Frequency Plots

featuresTwitter3Gram <- twitterDfm3Gram %>%
        textstat_frequency(60)

# Sort by reverse frequency order
featuresTwitter3Gram$feature <- featuresTwitter3Gram %>% 
        with(reorder(feature, -frequency))

featuresTwitter3Gram %>%
        ggplot(aes(x = feature, y = frequency)) +
        geom_point() +
        theme(axis.text.x = element_text(angle = 90, hjust = 1))

1-gram 90th percentile:

featuresTwitterFull <- twitterDfmNoStopWord %>% textstat_frequency()
summary(featuresTwitterFull)
##    feature            frequency             rank          docfreq 
##  Length:20671       Min.   :   1.000   Min.   :    1   Min.   :1  
##  Class :character   1st Qu.:   1.000   1st Qu.: 5168   1st Qu.:1  
##  Mode  :character   Median :   1.000   Median :10336   Median :1  
##                     Mean   :   5.565   Mean   :10336   Mean   :1  
##                     3rd Qu.:   3.000   3rd Qu.:15504   3rd Qu.:1  
##                     Max.   :1049.000   Max.   :20671   Max.   :1  
##     group          
##  Length:20671      
##  Class :character  
##  Mode  :character  
##                    
##                    
## 
quantile(featuresTwitterFull$frequency, c(0, .1, .5, .9, 1))
##   0%  10%  50%  90% 100% 
##    1    1    1    8 1049

2-gram 90th percentile:

featuresTwitter2GramFull <- twitterDfm2Gram %>% textstat_frequency()
summary(featuresTwitter2GramFull)
##    feature            frequency            rank           docfreq 
##  Length:119334      Min.   :  1.000   Min.   :     1   Min.   :1  
##  Class :character   1st Qu.:  1.000   1st Qu.: 29834   1st Qu.:1  
##  Mode  :character   Median :  1.000   Median : 59668   Median :1  
##                     Mean   :  1.706   Mean   : 59668   Mean   :1  
##                     3rd Qu.:  1.000   3rd Qu.: 89501   3rd Qu.:1  
##                     Max.   :531.000   Max.   :119334   Max.   :1  
##     group          
##  Length:119334     
##  Class :character  
##  Mode  :character  
##                    
##                    
## 
quantile(featuresTwitter2GramFull$frequency, c(0, .1, .5, .9, 1))
##   0%  10%  50%  90% 100% 
##    1    1    1    2  531

3-gram 90th percentile:

featuresTwitter3GramFull <- twitterDfm3Gram %>% textstat_frequency()
summary(featuresTwitter3GramFull)
##    feature            frequency            rank           docfreq 
##  Length:184636      Min.   :  1.000   Min.   :     1   Min.   :1  
##  Class :character   1st Qu.:  1.000   1st Qu.: 46160   1st Qu.:1  
##  Mode  :character   Median :  1.000   Median : 92318   Median :1  
##                     Mean   :  1.103   Mean   : 92318   Mean   :1  
##                     3rd Qu.:  1.000   3rd Qu.:138477   3rd Qu.:1  
##                     Max.   :151.000   Max.   :184636   Max.   :1  
##     group          
##  Length:184636     
##  Class :character  
##  Mode  :character  
##                    
##                    
## 
quantile(featuresTwitter3GramFull$frequency, c(0, .1, .5, .9, 1))
##   0%  10%  50%  90% 100% 
##    1    1    1    1  151

The rank-frequency distributions appear to follow Zipf's law: a word's frequency is roughly inversely proportional to its rank, so a handful of frequent words accounts for most tokens while the vast majority of n-grams occur only once.
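
As a quick visual check (a sketch using the featuresTwitterFull table computed above), frequency plotted against rank should fall on a roughly straight line on log-log axes:

featuresTwitterFull %>%
        ggplot(aes(x = rank, y = frequency)) +
        geom_line() +
        scale_x_log10() +
        scale_y_log10() +
        xlab("rank (log scale)") +
        ylab("frequency (log scale)")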

Modeling

A concept implementation of the prediction model follows.

Good-Turing Estimation

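Good-Turing smoothing reassigns probability mass from seen to unseen n-grams: an observed count r is discounted to r* = (r + 1) * N(r+1) / N(r), where N(r) is the number of n-gram types occurring exactly r times, and the leftover mass N(1) / N is reserved for unseen n-grams. The simple Good-Turing variant below (after Gale and Sampson) smooths the N(r) counts with a log-log linear fit and switches from the raw Turing estimate to the smoothed one once the two stop differing significantly.
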
simpleGoodTuring <- function(r, Nr, sd = 1.65) {
        # number of words
        N <- sum(r * Nr)
        d <- diff(r)
    
        ## Turing estimate
        # turing estimate index
        ti <- which(d == 1)
        # discount coefficients of Turing estimate
        dct <- numeric(length(r))
        dct[ti] <- (r[ti] + 1) / r[ti] * c(Nr[-1], 0)[ti] / Nr[ti]

        ## Linear Good-Turing estimate
        Zr <- Nr / c(1, 0.5 * (d[-1] + d[-length(d)]), d[length(d)])
        f <- lsfit(log(r), log(Zr))
        coef <- f$coef
        # corrected term frequency
        rc <- r * (1 + 1 / r)^(1 + coef[2])  
        # discount coefficients of Linear Good-Turing estimate
        dclgt <- rc / r

        ## make switch from Turing to LGT estimates
        # approximate standard deviation of the Turing estimate
        # (Gale & Sampson); use r[ti] rather than the vector index so the
        # formula stays correct when the observed counts r have gaps
        rsd <- rep(1, length(r))
        rsd[ti] <- (r[ti] + 1) / Nr[ti] * sqrt(Nr[ti + 1] * (1 + Nr[ti + 1] / Nr[ti]))
        
        dc <- dct
        for (i in 1:length(r)) {
            if (abs(dct[i] - dclgt[i]) * r[i] / rsd[i] <= sd) {
                dc[i:length(dc)] <- dclgt[i:length(dc)]
                break
            }
        }

        ## renormalize the probabilities for observed objects
        # summation of probabilities
        sump <- sum(dc * r * Nr) / N
        # renormalized discount coefficients
        dcr <- (1 - Nr[1] / N) * dc / sump
        
        # term frequency
        tf <- c(Nr[1] / N, r * dcr)
        p <- c(Nr[1] / N, r * dcr / N)
        names(p) <- names(tf) <- c(0, r)        
        
        list(p = p, r = tf)
}
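
A hypothetical toy example of calling the estimator (these frequency-of-frequency counts are invented purely for illustration):

toyR  <- 1:5                    # observed counts r
toyNr <- c(120, 40, 24, 13, 8)  # number of types seen exactly r times
sgt <- simpleGoodTuring(toyR, toyNr)
sgt$p["0"]  # mass reserved for unseen types: Nr[1] / N = 120 / 364
sgt$p["1"]  # discounted probability of a type seen once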

Precompute Smoothing Parameters

sgtFactory <- function() {
        NrTbl1 <- textstat_frequency(twitterDfm) %>%
                select(frequency) %>%
                mutate(freqOfFrequency = 1) %>%
                group_by(frequency) %>%
                summarise_all(sum)
        
        SGT1 <- simpleGoodTuring(NrTbl1$frequency, NrTbl1$freqOfFrequency)
        
        NrTbl2 <- textstat_frequency(twitterDfm2Gram) %>%
                select(frequency) %>%
                mutate(freqOfFrequency = 1) %>%
                group_by(frequency) %>%
                summarise_all(sum)
        
        SGT2 <- simpleGoodTuring(NrTbl2$frequency, NrTbl2$freqOfFrequency)

        NrTbl3 <- textstat_frequency(twitterDfm3Gram) %>%
                select(frequency) %>%
                mutate(freqOfFrequency = 1) %>%
                group_by(frequency) %>%
                summarise_all(sum)
        
        SGT3 <- simpleGoodTuring(NrTbl3$frequency, NrTbl3$freqOfFrequency)
        
        c(
                dUnigram = function(freq) {
                        SGT1$p[as.character(freq)]
                },
                dBigram = function(freq) {
                        SGT2$r[as.character(freq)] / freq
                },
                dTrigram = function(freq) {
                        SGT3$r[as.character(freq)] / freq
                }
        )
}

SGT <- sgtFactory()
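
sgtFactory() bundles one discount function per n-gram order. As a quick sanity check (a sketch), they can be queried directly:

SGT$dUnigram(1)  # smoothed probability for unigrams seen once
SGT$dTrigram(2)  # discount coefficient for trigrams seen twice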

Parameters Used by Our Prediction Model

# Note: c() flattens the SGT list, so its discount functions become
# predictModel$SGT.dUnigram, $SGT.dBigram, and $SGT.dTrigram below.
predictModel <- c(SGT = SGT, 
                  trigram = twitterDfm3Gram,
                  bigram = twitterDfm2Gram,
                  unigram = twitterDfm)
# save(predictModel, file = "predictModel.rda")

3-Gram Model Using Katz's Back-off

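Katz's back-off scores candidates with the highest-order n-gram that has evidence: given the last two words, P_bo(w3 | w1, w2) = d(C(w1 w2 w3)) * C(w1 w2 w3) / C(w1 w2), where d is the Good-Turing discount computed above; if no matching trigram occurs more than k times, the model backs off to bigram counts, and finally to the unigram distribution. Note that this concept implementation omits the back-off weight (alpha) that a full Katz model uses to renormalize the lower-order probabilities.
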
nextWords <- function(input, predictModel, outputs = 3, k = 0) {
        # k is the least important of the parameters. It is usually chosen to be 0.
        # However, empirical testing may find better values for k.
        inputs <- str_split(input, "\\s+")[[1]]
        inputsSize <- length(inputs)
        if (inputsSize > 1) {
                preTriGram <- paste(inputs[inputsSize - 1],
                                 inputs[inputsSize],
                                 sep = "_")

                nextWordDfm <- dfm_select(predictModel$trigram, 
                                          paste0(preTriGram, "_*"))
        } else {
                if (inputs == "") { return() }
                nextWordDfm <- NULL
        }
        preBiGram <- inputs[inputsSize]

        # extract n-gram that starts with input
        featuresNextWord <- NULL

        if (length(nextWordDfm) > k) {
                prevWordDfm <- dfm_select(predictModel$bigram,
                                          phrase(preTriGram))

                prevWordFreq <- textstat_frequency(prevWordDfm)$frequency

                # data frame
                featuresNextWord <-
                        textstat_frequency(nextWordDfm) %>%
                        mutate(p_bo = predictModel$SGT.dTrigram(frequency) * frequency / prevWordFreq)

                # human readable outputs
                featuresNextWord$feature <-
                        sapply(as.vector(featuresNextWord$feature),
                               function(x) {
                                       str_split(x, "_")[[1]][3]
                               })
                
                # Sort by reverse frequency order
                featuresNextWord$feature <-
                        with(featuresNextWord,
                             reorder(feature,-p_bo))
                
        } else {
                nextWordDfm <- dfm_select(predictModel$bigram,
                                          paste0(preBiGram, "_*"))
                
                if (length(nextWordDfm) > k) {
                        prevWordDfm <- dfm_select(predictModel$unigram,
                                                  phrase(preBiGram))
                        prevWordFreq <- textstat_frequency(prevWordDfm)$frequency
        
                        # data frame
                        featuresNextWord <-
                                textstat_frequency(nextWordDfm) %>%
                                mutate(p_bo = predictModel$SGT.dBigram(frequency) * frequency / prevWordFreq)
        
                        # human readable outputs
                        featuresNextWord$feature <-
                                sapply(as.vector(featuresNextWord$feature),
                                       function(x) {
                                               str_split(x, "_")[[1]][2]
                                       })
                        
                        # Sort by reverse frequency order
                        featuresNextWord$feature <-
                                with(featuresNextWord,
                                     reorder(feature,-p_bo))
                        
                } else {
                        # no bigram evidence either: fall back to the
                        # unigram distribution bundled in the model
                        nextWordDfm <- predictModel$unigram
                        featuresNextWord <-
                                textstat_frequency(nextWordDfm) %>%
                                mutate(p_bo = predictModel$SGT.dUnigram(frequency))
        
                        # Sort by reverse frequency order
                        featuresNextWord$feature <-
                                with(featuresNextWord,
                                     reorder(feature,-p_bo))
                                
                }
        }
        
        featuresNextWord %>% slice(1:outputs)
}

I went to be

# load("predictModel.rda")
ggplot(nextWords("I went to be", predictModel), aes(x = feature, y = p_bo)) +
        geom_bar(stat = "identity") + 
        xlab("Predicted next word") + ylab("P_bo")

system.time(nextWords("I went to be", predictModel))
##    user  system elapsed 
##   2.813   0.152   2.997

Conclusion and Next Actions

The output of the implemented model is shown above. I have not yet measured prediction accuracy, but the predictions look plausible. However, predicting a single next word takes about three seconds of elapsed time, which is too slow for an interactive keyboard, so the next steps are to reduce execution time and to measure and improve prediction accuracy.

References